/*========================== begin_copyright_notice ============================

Copyright (C) 2017 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/


#include "Compiler/CISACodeGen/VectorProcess.hpp"
#include "Compiler/CISACodeGen/ShaderCodeGen.hpp"
#include "Compiler/CISACodeGen/EmitVISAPass.hpp"
#include "Compiler/IGCPassSupport.h"
#include "common/IGCIRBuilder.h"
#include "common/LLVMWarningsPush.hpp"
#include "llvmWrapper/Support/Alignment.h"
#include "llvmWrapper/IR/DerivedTypes.h"
#include <llvm/IR/DataLayout.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/InstIterator.h>
#include "common/LLVMWarningsPop.hpp"
#include "Probe/Assertion.h"

using namespace llvm;
using namespace IGC;
using IGCLLVM::FixedVectorType;

//
// Description of VectorProcess Pass
//   This pass makes the data layout of vectors explicit by inserting bitcasts.
//   These bitcasts have a special meaning and cannot be deleted. We insert
//   them right before emitting vISA code so that most codegen passes do not
//   need to special-handle them.
//
// We assume that a vector type (in LLVM IR) is kept in a "packed form": when
// several workitems (each LLVM IR invocation corresponds to a single workitem)
// are grouped into a single thread, the elements of a vector in LLVM IR are
// no longer consecutive in the GRF. For example, given <n x T> v, its vISA
// variable under SIMD8 (8 workitems grouped into a single thread) is laid
// out as follows (for readability, C variables are used and C's struct
// layout is assumed):
//     struct { T c0, c1, c2, c3, c4, c5, c6, c7 } visaVar[n];
//     where c0, c1, ..., c7 hold the values for SIMD lanes 0 -- 7,
//     respectively. For example, assuming the original workitem 0 runs on
//     SIMD lane 0, its vector v consists of
//       visaVar[0].c0, visaVar[1].c0, visaVar[2].c0, ..., visaVar[n-1].c0,
//     which is no longer consecutive in visaVar.
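//     For illustration (a sketch, assuming a 32-byte GRF): for <4 x i32> v
//     under SIMD8, visaVar occupies 4 * 8 DWs = 4 GRFs, and element i of
//     lane j sits at byte offset (i * 8 + j) * 4 from the start of visaVar.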
//
// This layout cannot always be produced efficiently by gathers/scatters.
// For example, <16 x i8> could be read by 16 byte-scattered reads, each
// reading 1 byte for every lane; but <16 x i8> can also be viewed as
// <4 x i32>, and a single gather4 fetches the entire <4 x i32>. Thus, to use
// an efficient message, the original vector can be re-laid-out as a different
// vector type that maps to the send message more efficiently. This
// "re-layout" has a cost: we have to generate mov instructions (possibly
// many), as shown below:
//    <16xi8> v
//       struct { i8 c0, c1, ..., c7 } visaVar_v[16];
//       Note: this array of structs is the layout IGC requires (referred to
//             as the packed form).
//
//    <4xi32> v_as4xi32
//       struct { i32 c0, c1, ..., c7 } visaVar_v_as4xi32[4]; or
//          struct { i8 c0[4], c1[4], ..., c7[4] } visaVar_v_as4xi32[4];
//          note: each element of the array is actually a struct of arrays!
//       visaVar_v_as4xi32 = gather4 &v
//
//
//    To convert <4xi32> back to <16xi8> (the required packed form), the
//    following is needed:
//       for(i=0; i < 4; ++i)
//         for(j=0; j < 4; ++j)
//            visaVar_v[i*4 + j].c0 = visaVar_v_as4xi32[i].c0[j];
//            ......
//            visaVar_v[i*4 + j].c7 = visaVar_v_as4xi32[i].c7[j];
//    and this takes 4 * 4 * 8 = 128 mov instructions!
//
//
// To generate such mov instructions explicitly, we insert a bitcast between
// the original vector and the one we want to use for the load/store; this
// bitcast is lowered into movs similar to the conversion code shown above.
// We call this bitcast a re-data-layout bitcast. The following is the code
// generated for this explicit bitcast (handled by emitVectorBitCast):
//     before:   %v = load <16xi8>* p
//
//     after:    %np = bitcast p to <4 x i32>*
//               %nv = load <4 x i32>* np
//               %v  = bitcast nv to <16 x i8>       <<--- re-data-layout bitcast
//
// Since this could potentially generate many movs (some may be optimized
// away), these bitcasts are inserted only when needed.
//
// ** Note: at this point the size of a vector is guaranteed to be 1 byte,
// ** 2 bytes, or a multiple of a DW. This is ensured by VectorPreProcess
// ** (e.g. <3 x i8> cannot be mapped to a single send message and has to be
// ** split; we split it in VectorPreProcess so that we don't have to worry
// ** about splitting vectors here).
//
// Given a vector <n x T>, the type used for the load/store is calculated
// "conceptually" as follows. Note that if sizeof(T) is 4 or 8, we normally do
// not need any conversion at all (with exceptions when the load/store is
// mis-aligned). (Keep in mind that sizeof(T)*n is 1, 2, or a multiple of a DW.)
//    if (n * sizeof(T) < 4 bytes) {
//      <n x T> ---> S; where S is the scalar type whose size == n * sizeof(T);
//    } else if ( (sizeof(T) != 4 && Using A32 message ) ||
//                (sizeof(T) != 4|8 && Using A64 message) ) {
//
//      <n x T>  -->  <n1 x i64>  : sizeof(T) == 8 && A64 messages; or
//                    <n1 x i32>  : otherwise
//    }
//
// For example,
//  (1)   %1 = load <8 x i16> *p
//        converted into
//          new_p = bitcast p to <4 x i32>*
//          %2    = load <4 x i32> *new_p
//          %1    = bitcast %2 to <8 x i16>
//
//  (2)   %1 = load <4 x i64> *p
//        Using A32, converted into
//          new_p = bitcast p to <8 x i32>*
//          %2 = load <8 x i32> *new_p
//          %1 = bitcast %2 to <4 x i64>
//
//        Using A64, do nothing.
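//
//  (3)   %1 = load <2 x i8> *p
//        (a sketch of the sub-DW case: the whole vector becomes one scalar)
//        converted into
//          new_p = bitcast p to i16*
//          %2    = load i16 *new_p
//          %1    = bitcast %2 to <2 x i8>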
//
namespace
{
    class VectorProcess : public FunctionPass
    {
    public:
        typedef SmallVector<Instruction*, 32> InstWorkVector;

        static char ID; // Pass identification, replacement for typeid
        VectorProcess()
            : FunctionPass(ID)
            , m_DL(nullptr)
            , m_C(nullptr)
            , has_8Byte_A64_BS(true)
            , has_QW_BTS_GS(false)
            , m_WorkList()
        {
            initializeVectorProcessPass(*PassRegistry::getPassRegistry());
        }
        StringRef getPassName() const override { return "VectorProcess"; }
        bool runOnFunction(Function& F) override;
        void getAnalysisUsage(AnalysisUsage& AU) const override
        {
            AU.setPreservesCFG();
            AU.addRequired<CodeGenContextWrapper>();
        }

    private:
        bool reLayoutLoadStore(Instruction* Inst);
        bool optimizeBitCast(BitCastInst* BC);
        Value* ProcessMergeValue(Instruction *Inst, Value* V, Type* NewTy,
                                 Type* NewIntETy, Type* NewIntTy) const;

    private:
        const DataLayout* m_DL;
        LLVMContext* m_C;
        bool has_8Byte_A64_BS; // true if 8-byte A64 Byte scattered is supported
        bool has_QW_BTS_GS;    // true if qword BTS Gather/Scatter is supported
        InstWorkVector m_WorkList;
    };
}

// Register pass to igc-opt
#define PASS_FLAG "igc-vectorprocess"
#define PASS_DESCRIPTION "Process vector loads/stores for explicit vISA variable layout"
#define PASS_CFG_ONLY false
#define PASS_ANALYSIS false
IGC_INITIALIZE_PASS_BEGIN(VectorProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
IGC_INITIALIZE_PASS_DEPENDENCY(CodeGenContextWrapper)
IGC_INITIALIZE_PASS_END(VectorProcess, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)

char VectorProcess::ID = 0;

FunctionPass* IGC::createVectorProcessPass()
{
    return new VectorProcess();
}

bool VectorProcess::reLayoutLoadStore(Instruction* Inst)
{
    LoadInst* const LI = dyn_cast<LoadInst>(Inst);
    StoreInst* const SI = dyn_cast<StoreInst>(Inst);
    GenIntrinsicInst* const II = dyn_cast<GenIntrinsicInst>(Inst);

    Value* Ptr = nullptr;
    Type* Ty = nullptr;
    if (nullptr != LI)
    {
        Ptr = LI->getPointerOperand();
        Ty = LI->getType();
    }
    else if (nullptr != SI)
    {
        IGC_ASSERT(0 < SI->getNumOperands());
        IGC_ASSERT(nullptr != SI->getOperand(0));

        Ptr = SI->getPointerOperand();
        Ty = SI->getOperand(0)->getType();
    }
    else
    {
        IGC_ASSERT(nullptr != II);
        IGC_ASSERT(0 < II->getNumOperands());
        IGC_ASSERT(nullptr != II->getOperand(0));

        Ptr = II->getOperand(0);

        if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
            II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed ||
            II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad)
        {
            Ty = II->getType();
        }
        else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
                 II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed)
        {
            IGC_ASSERT(2 < IGCLLVM::getNumArgOperands(II));
            IGC_ASSERT(nullptr != II->getArgOperand(2));

            Ty = II->getArgOperand(2)->getType();
        }
        else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore)
        {
            IGC_ASSERT(1 < IGCLLVM::getNumArgOperands(II));
            IGC_ASSERT(nullptr != II->getArgOperand(1));

            Ty = II->getArgOperand(1)->getType();
        }
        else
        {
            IGC_ASSERT_MESSAGE(0, "Internal Error: unknown intrinsic");
        }
    }

    IGC_ASSERT(nullptr != Ptr);
    IGC_ASSERT(nullptr != Ty);

    IGCLLVM::FixedVectorType* const VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);

    // Treat a scalar as 1-element vector
    uint32_t nelts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
    Type* eTy = VTy ? VTy->getElementType() : Ty;
    uint32_t eTyBits = int_cast<unsigned int>(m_DL->getTypeSizeInBits(eTy));

    IGC_ASSERT_MESSAGE((eTyBits == 8 || eTyBits == 16 || eTyBits == 32 || eTyBits == 64), "the size of a vector element must be 8/16/32/64 bits.");

    uint32_t eTyBytes = (eTyBits >> 3);
    uint32_t TBytes = nelts * eTyBytes;  // Total size in bytes

    //
    // Assumptions:
    //    1. if the vector size is < 4 bytes, it must be 1 or 2 bytes (never 3);
    //    2. if the vector size is >= 4 bytes, it must be a multiple of a DW.
    // These two assumptions are guaranteed by VectorPreProcess.
    //
    // So far, we use A32 untyped and byte-scattered messages, and A64
    // scattered and A64 untyped messages.
    //
    // A32: DW is used as the new element type.
    // A64: the new element type is:
    //        unaligned load/store: DW if there is no 8-byte A64 byte-scattered
    //                              message, QW otherwise;
    //        aligned vector of long type: QW;
    //        others: DW.
    // A vector whose size is smaller than 4 bytes must be converted to a
    // 1-element vector (or scalar) so that all elements are read/written with
    // a single message.
    //
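    // For illustration (a sketch of the rules above):
    //    <8 x i16>, A32                 -->  <4 x i32>
    //    <4 x i64>, A64, align >= 8     -->  unchanged (QW elements)
    //    <4 x i64>, A32 (non-LSC path)  -->  <8 x i32>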
    Type* new_eTy;
    uint32_t new_nelts;
    PointerType* PtrTy = cast<PointerType>(Ptr->getType());

    if (TBytes == 1)
    {
        IGC_ASSERT_MESSAGE(nelts == 1, "Internal Error: a 1-byte access must be a single element");
        return false;
    }
    else if (TBytes == 2 || TBytes == 4)
    {
        if (nelts == 1)
        {
            // No conversion needed.
            return false;
        }
        new_nelts = 1;
        new_eTy = (TBytes == 2) ? Type::getInt16Ty(*m_C)
            : Type::getInt32Ty(*m_C);
    }
    else
    {
        // This handles all the other cases
        CodeGenContext* cgCtx = nullptr;
        cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
        bool useA64 = IGC::isA64Ptr(PtrTy, cgCtx);
        bool useBSS = IGC::DecodeBufferType(PtrTy->getAddressSpace()) == IGC::BINDLESS;
        alignment_t align;
        if (LI)
        {
            align = IGCLLVM::getAlignmentValue(LI);
        }
        else if (SI)
        {
            align = IGCLLVM::getAlignmentValue(SI);
        }
        else if (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad)
        {
            align = cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
        }
        else if (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore)
        {
            align = cast<ConstantInt>(II->getArgOperand(2))->getZExtValue();
        }
        else
        {
            align = 1;
        }

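        // Decide whether the re-laid-out element type should be QW (i64) or
        // DW (i32). Roughly: QW is used for under-aligned A64 accesses when
        // the 8-byte A64 byte-scattered message is available, for naturally
        // aligned i64 elements (A64, or bindless with QW gather/scatter
        // support), and, on the LSC path below, whenever the element size is
        // 8 bytes.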
        bool useQW = false;
        if (useA64)
        {
            useQW = (TBytes % 8 == 0) &&
                ((has_8Byte_A64_BS && align < 4) || (eTyBytes == 8U && align >= 8U));
        }
        else if (useBSS)
        {
            useQW = has_QW_BTS_GS && nelts == 1 && (eTyBytes == 8U && align >= 8U);
        }

        if (EmitPass::shouldGenerateLSCQuery(*cgCtx, Inst) == Tristate::True)
        {
            // With LSC, want to use QW if element size is 8 bytes.
            useQW = (eTyBytes == 8);
        }

        const uint32_t new_eTyBytes = useQW ? 8 : 4;
        if (eTyBytes == new_eTyBytes && !eTy->isAggregateType())
        {
            // The original vector is already a good one. Skip.
            return false;
        }
        new_eTy = useQW ? Type::getInt64Ty(*m_C) : Type::getInt32Ty(*m_C);
        IGC_ASSERT(new_eTyBytes);
        IGC_ASSERT_MESSAGE((TBytes % new_eTyBytes) == 0, "Wrong new vector size");
        new_nelts = TBytes / new_eTyBytes;
    }

    IGCIRBuilder<> Builder(Inst);
    Type* newVTy;
    if (new_nelts == 1)
    {
        newVTy = new_eTy;
    }
    else
    {
        newVTy = FixedVectorType::get(new_eTy, new_nelts);
    }
    Type* newPtrTy = PointerType::get(newVTy, PtrTy->getPointerAddressSpace());
    Value* newPtr;
    if (IntToPtrInst * i2p = dyn_cast<IntToPtrInst>(Ptr))
    {
        newPtr = Builder.CreateIntToPtr(i2p->getOperand(0), newPtrTy, "IntToPtr2");
    }
    else
    {
        newPtr = Builder.CreateBitCast(Ptr, newPtrTy, "vptrcast");
    }

    // These types are needed when we are dealing with pointers
    // and using ptrtoint and inttoptr.
    Type* int_eTy = Type::getIntNTy(*m_C, eTyBits);
    Type* new_intTy = VTy ? FixedVectorType::get(int_eTy, nelts) : int_eTy;

    if (LI || (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad))
    {
        Instruction* oldLoad = LI ? cast<Instruction>(LI) : cast<Instruction>(II);
        Instruction* load;
        if (LI) {
          load = Builder.CreateAlignedLoad(newVTy, newPtr,
            IGCLLVM::getCorrectAlign(IGCLLVM::getAlignmentValue(LI)),
            LI->isVolatile(),
            "vCastload");
        } else {
            Type* types[] =
            {
                newVTy,
                newPtrTy,
                newVTy
            };

            Function* F = GenISAIntrinsic::getDeclaration(
                II->getParent()->getParent()->getParent(),
                GenISAIntrinsic::GenISA_PredicatedLoad,
                types);
            load = Builder.CreateCall4(F, newPtr, II->getOperand(1), II->getOperand(2),
                                       ProcessMergeValue(Inst, II->getOperand(3), newVTy, int_eTy, new_intTy));
        }
        load->copyMetadata(*oldLoad);

        Value* V = load;

        if (eTy->isPointerTy())
        {
            // Cannot bitcast int to ptr; need to use IntToPtr.
            // First, bitcast the loaded value to a vector type that matches
            //        the original vector type but with the pointer element
            //        type replaced by an integer element type.
            // Second, IntToPtr-cast it to the original vector type.
            V = Builder.CreateBitCast(V, new_intTy);
            if (VTy)
            {
                // If we need a vector inttoptr, scalarize it here.
                auto* BC = V;
                V = UndefValue::get(Ty);
                for (unsigned i = 0; i < nelts; i++)
                {
                    auto* EE = Builder.CreateExtractElement(BC, i);
                    auto* ITP = Builder.CreateIntToPtr(EE, eTy);
                    V = Builder.CreateInsertElement(V, ITP, i);
                }
            }
            else
            {
                V = Builder.CreateIntToPtr(V, Ty);
            }
        }
        else
        {
            // TODO: if Ty is an aggregate type, this bitcast contradicts the LLVM spec
            V = Builder.CreateBitCast(V, Ty);
        }
        oldLoad->replaceAllUsesWith(V);
        oldLoad->eraseFromParent();
    }
    else
        if (SI || (II && II->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore))
        {
            Instruction *oldStore = SI ? cast<Instruction>(SI) : cast<Instruction>(II);
            Value* StoreVal = SI ? SI->getValueOperand() : II->getArgOperand(1);
            Value* V;
            if (eTy->isPointerTy())
            {

                // Similar to the load: first PtrToInt-cast to a new integer
                // vector, then bitcast to the new store type.
                Type* int_eTy = Type::getIntNTy(*m_C, eTyBits);
                if (VTy)
                {
                    // If we need a vector ptrtoint, scalarize it here.
                    V = UndefValue::get(FixedVectorType::get(int_eTy, nelts));
                    for (unsigned i = 0; i < nelts; i++)
                    {
                        auto* EE = Builder.CreateExtractElement(StoreVal, i);
                        auto* ITP = Builder.CreatePtrToInt(EE, int_eTy);
                        V = Builder.CreateInsertElement(V, ITP, i);
                    }
                }
                else if (isa<IntToPtrInst>(StoreVal) &&
                    cast<IntToPtrInst>(StoreVal)->getOperand(0)->getType() == int_eTy)
                {
                    // Detect the case where creating PtrToInt and BitCast
                    // instructions is not needed, i.e. when the store value was
                    // created from a vector of the same type as the target
                    // vector type.
                    //
                    // e.g. example from a Vulkan shader with variable pointers:
                    // Before:
                    //     %7 = bitcast <2 x i32> %assembled.vect7 to i64
                    //     %Temp-26.i.VP = inttoptr i64 %7 to i32 addrspace(1179648)*
                    //     store i32 addrspace(1179648)* %Temp-26.i.VP, i32 addrspace(1179648)** %6, align 8
                    // After:
                    //     store <2 x i32> %assembled.vect7, <2 x i32>* %vptrcast, align 8

                    V = cast<IntToPtrInst>(StoreVal)->getOperand(0);
                }
                else
                {
                    V = Builder.CreatePtrToInt(StoreVal, int_eTy);
                }

                if (isa<BitCastInst>(V) &&
                    (cast<BitCastInst>(V)->getOperand(0)->getType() == newVTy))
                {
                    V = cast<BitCastInst>(V)->getOperand(0);
                }
                else
                {
                    V = Builder.CreateBitCast(V, newVTy);
                }
            }
            else
            {
                V = Builder.CreateBitCast(StoreVal, newVTy);
            }

            Instruction* store = nullptr;
            if (SI && IGCLLVM::getAlignmentValue(SI) == 0)
            {
                store = Builder.CreateStore(V, newPtr, SI->isVolatile());
            }
            else if (SI)
            {
                store = Builder.CreateAlignedStore(V, newPtr, IGCLLVM::getAlign(*SI), SI->isVolatile());
            }
            else
            {
                Type* types[] =
                {
                    newPtrTy,
                    newVTy
                };

                Function* F = GenISAIntrinsic::getDeclaration(
                    II->getParent()->getParent()->getParent(),
                    GenISAIntrinsic::GenISA_PredicatedStore,
                    types);
                store = Builder.CreateCall4(F, newPtr, V, II->getOperand(2), II->getOperand(3));
            }
            store->copyMetadata(*oldStore);
            oldStore->eraseFromParent();
        }
        else if (II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
                 II->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed)
        {
            Type* types[] =
            {
                newVTy,
                newPtrTy
            };

            Function* F = GenISAIntrinsic::getDeclaration(
                II->getParent()->getParent()->getParent(),
                GenISAIntrinsic::GenISA_ldrawvector_indexed,
                types);
            Value* V = Builder.CreateCall4(F, newPtr, II->getOperand(1), II->getOperand(2), II->getOperand(3));

            if (eTy->isPointerTy())
            {
                Type* intETy = Type::getIntNTy(*m_C, eTyBits);
                Type* newIntTy = VTy ? IGCLLVM::FixedVectorType::get(intETy, nelts) : intETy;
                V = Builder.CreateBitCast(V, newIntTy);
                V = Builder.CreateIntToPtr(V, Ty);
            }
            else
            {
                V = Builder.CreateBitCast(V, Ty);
            }

            II->replaceAllUsesWith(V);
            II->eraseFromParent();
        }
        else
        {
            IGC_ASSERT(II->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
                       II->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed);
            Type* types[] =
            {
                newPtrTy,
                newVTy
            };

            Function* F = GenISAIntrinsic::getDeclaration(
                II->getParent()->getParent()->getParent(),
                GenISAIntrinsic::GenISA_storerawvector_indexed,
                types);

            Value* V;
            if (eTy->isPointerTy())
            {
                Type* intETy = Type::getIntNTy(*m_C, eTyBits);
                Type* newIntTy = VTy ? IGCLLVM::FixedVectorType::get(intETy, nelts) : intETy;
                V = Builder.CreatePtrToInt(II->getOperand(2), newIntTy);
                V = Builder.CreateBitCast(V, newVTy);
            }
            else
            {
                V = Builder.CreateBitCast(II->getOperand(2), newVTy);
            }

            Builder.CreateCall5(F, newPtr, II->getOperand(1), V, II->getOperand(3), II->getOperand(4));
            II->eraseFromParent();
        }
    return true;
}

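// optimizeBitCast folds chains of bitcasts that reLayoutLoadStore may create.
// Given
//    %b2 = bitcast %A  to T2
//    %b1 = bitcast %b2 to T1
// each bitcast user %b1 of %b2 is rewritten to use %A directly (as a new
// "bitcast %A to T1", or simply %A when T1 equals %A's type); any bitcasts
// left dead are erased afterwards in runOnFunction().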
bool VectorProcess::optimizeBitCast(BitCastInst* BC)
{
    bool change = false;
    Value* Src = BC->getOperand(0);
    Type* SrcTy = Src->getType();
    Type* Ty = BC->getType();

    if (Ty == SrcTy)
    {
        BC->replaceAllUsesWith(Src);
        return true;
    }

    // Only handle non-pointer bitcast
    if (isa<PointerType>(Ty) || isa<PointerType>(SrcTy))
    {
        return false;
    }

    for (Value::user_iterator UI = BC->user_begin(), UE = BC->user_end();
        UI != UE; ++UI)
    {
        if (BitCastInst * Inst = dyn_cast<BitCastInst>(*UI))
        {
            IRBuilder<> Builder(Inst);
            Type* Ty1 = Inst->getType();
            if (SrcTy == Ty1)
            {
                Inst->replaceAllUsesWith(Src);
            }
            else
            {
                BitCastInst* nBC = (BitCastInst*)Builder.CreateBitCast(Src, Ty1);
                Inst->replaceAllUsesWith(nBC);

                // Add nBC so it will be processed again.
                m_WorkList.push_back(nBC);
            }
            change = true;
        }
    }
    return change;
}

bool VectorProcess::runOnFunction(Function& F)
{
    CodeGenContext* cgCtx = nullptr;
    cgCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
    bool changed = false;
    m_DL = &F.getParent()->getDataLayout();
    m_C = &F.getContext();
    has_8Byte_A64_BS = cgCtx->platform.has8ByteA64ByteScatteredMessage();
    has_QW_BTS_GS = cgCtx->platform.hasQWGatherScatterBTSMessage();

    //  Adjust load/store layout by inserting bitcast.
    //  Those bitcasts should not be optimized away.
    for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
    {
        Instruction* inst = &*I;
        if (isa<LoadInst>(inst) || isa<StoreInst>(inst))
        {
            m_WorkList.push_back(inst);
        }
        else
            if (GenIntrinsicInst * intrin = dyn_cast<GenIntrinsicInst>(inst))
            {
                if (intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_ldrawvector_indexed ||
                    intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_ldraw_indexed ||
                    intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_storerawvector_indexed ||
                    intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_storeraw_indexed ||
                    intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedLoad ||
                    intrin->getIntrinsicID() == GenISAIntrinsic::GenISA_PredicatedStore)
                {
                    m_WorkList.push_back(inst);
                }
            }
    }

    for (unsigned i = 0; i < m_WorkList.size(); ++i)
    {
        if (reLayoutLoadStore(m_WorkList[i]))
        {
            changed = true;
        }
    }
    m_WorkList.clear();

    // To remove unnecessary bitcast
    if (changed)
    {
        for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
        {
            Instruction* inst = &*I;
            if (isa<BitCastInst>(inst))
            {
                m_WorkList.push_back(inst);
            }
        }

        bool doclean = false;
        for (unsigned i = 0; i < m_WorkList.size(); ++i)
        {
            if (BitCastInst * Inst = dyn_cast<BitCastInst>(m_WorkList[i]))
            {
                if (optimizeBitCast(Inst))
                {
                    doclean = true;
                }
            }
        }

        while (doclean)
        {
            // Given  b2 = bitcast A,  T2
            //        b1 = bitcast b2, T1
            // we say b1's level is 1 and b2's level is 2.
            //
            // This pass, in theory, can leave at most two levels of dead
            // bitcasts. Therefore, we expect the "while" loop to take at most
            // three iterations. The WorkList holds only bitcasts and is not
            // expected to be big.
            doclean = false;
            for (unsigned i = 0; i < m_WorkList.size(); ++i)
            {
                if (m_WorkList[i] && m_WorkList[i]->use_empty())
                {
                    m_WorkList[i]->eraseFromParent();
                    m_WorkList[i] = nullptr;
                    doclean = true;
                }
            }
        }

        m_WorkList.clear();
    }
    //DumpLLVMIR(cgCtx, "vectorprocess");
    return changed;
}

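// ProcessMergeValue converts the merge value of a GenISA_PredicatedLoad
// (its last operand) from the original type to the re-laid-out type NewTy.
// Zeroinitializer/undef/poison constants are simply recreated with NewTy;
// pointer-typed elements are first PtrToInt-cast (scalarized for vectors,
// using NewIntEType/NewIntTy) and the result is then bitcast to NewTy.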
Value* VectorProcess::ProcessMergeValue(Instruction *Inst, Value* V, Type* NewTy, Type* NewIntEType, Type* NewIntTy) const
{
    // If V is a zeroinitializer, undef, or poison value, we just need to create
    // the corresponding value of NewTy.
    if (isa<ConstantAggregateZero>(V)) {
        if(IGCLLVM::FixedVectorType *NewVTy = dyn_cast<IGCLLVM::FixedVectorType>(NewTy))
            return ConstantAggregateZero::get(NewVTy);
        else
            return Constant::getNullValue(NewTy);
    }

    if (isa<PoisonValue>(V))
        return PoisonValue::get(NewTy);

    if (isa<UndefValue>(V))
        return UndefValue::get(NewTy);

    IRBuilder<> Builder(Inst);

    Type *Ty = V->getType();
    IGCLLVM::FixedVectorType* const VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
    uint32_t nelts = VTy ? int_cast<uint32_t>(VTy->getNumElements()) : 1;
    Type* eTy = VTy ? VTy->getElementType() : Ty;

    if (eTy->isPointerTy())
    {
        // Cannot bitcast ptr to int: first PtrToInt-cast, then bitcast the
        // integer (scalar or vector) to the new type.
        if (VTy)
        {
            // need a vector ptrtoint, scalarize:
            auto* oldV = V;
            V = UndefValue::get(NewIntTy);
            for (unsigned i = 0; i < nelts; ++i)
            {
                auto* EE = Builder.CreateExtractElement(oldV, i);
                auto* PTI = Builder.CreatePtrToInt(EE, NewIntEType);
                V = Builder.CreateInsertElement(V, PTI, i);
            }
        }
        else
        {
            V = Builder.CreatePtrToInt(V, NewIntTy);
        }
    }

    return Builder.CreateBitCast(V, NewTy);
}

//
// getInfo maps a vector to the right messages. It assumes that a vector may
// be mapped to more than one message, and those messages may differ from
// each other as long as they return exactly the same "packed form" of the
// vector.
//
// getInfo() initializes the array of structs (insts), which specifies the
// send instructions (or gather/scatter vISA instructions) needed to
// read/write this vector into/from a vISA variable. Clients access this
// array of structs directly after the getInfo() call.
//
// VectorProcess changes each vector load and store into a new vector load
// and store that maps exactly to these messages. getInfo() has the
// following agreement with VectorProcess:
//   1) If sizeof(Ty) >= 4 bytes, sizeof(Ty) must be a multiple of 4 bytes.
//      Furthermore, the element type of 'Ty' if 'Ty' is a vector type, or
//      'Ty' itself if 'Ty' is a scalar type, must be either 4 bytes (DW) or
//      8 bytes (QW).
//   2) If sizeof(Ty) < 4 bytes, sizeof(Ty) must be either 1 byte or
//      2 bytes. sizeof(Ty) cannot be 3 bytes!
// (Note that VectorMessage and VectorProcess must be kept in sync with
//  regard to this agreement.)
//
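// For illustration (a sketch): an A64 SIMD8 access of <7 x i32> (28 bytes,
// align >= 4) on a platform with the 8-DW A64 scattered message is covered by
// two entries in insts[]: a 4-DW scattered access at byte 0 and a 3-DW access
// at byte 16, the latter promoted to an A64 untyped message by the tail
// handling at the end of getInfo().
//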
void VectorMessage::getInfo(Type* Ty, uint64_t Align, bool useA32,
    bool forceByteScatteredRW)
{
    VectorType* VTy = dyn_cast<VectorType>(Ty);
    Type* eTy = VTy ? cast<VectorType>(VTy)->getElementType() : Ty;
    unsigned eltSize = Shader->GetScalarTypeSizeInRegister(eTy);
    unsigned nElts = VTy ? (unsigned)cast<IGCLLVM::FixedVectorType>(VTy)->getNumElements() : 1;
    // total bytes
    const unsigned TBytes = nElts * eltSize;

    // Per-channel Max Bytes (MB) that can be read/written by a single send inst
    unsigned MB;
    SIMDMode SM = Shader->m_SIMDSize;
    bool has_8B_A64_BS =
        Shader->m_Platform->has8ByteA64ByteScatteredMessage();
    bool has_8DW_A64_SM =
        Shader->m_Platform->has8DWA64ScatteredMessage();

    //
    // Set up default message and the data type of the message
    //
    MESSAGE_KIND defaultKind;
    VISA_Type    defaultDataType;
    if (Align < 4 || TBytes < 4 || forceByteScatteredRW)
    {
        if (forceByteScatteredRW)
        {
            IGC_ASSERT(useA32);
        }
        defaultKind = useA32
            ? MESSAGE_A32_BYTE_SCATTERED_RW
            : MESSAGE_A64_SCATTERED_RW;
        MB = useA32
            ? A32_BYTE_SCATTERED_MAX_BYTES
            : ((has_8B_A64_BS && eltSize == 8)
                ? A64_BYTE_SCATTERED_MAX_BYTES_8B
                : A64_BYTE_SCATTERED_MAX_BYTES);
        defaultDataType = ISA_TYPE_UB;

        // To make sure that vector and message match.
        IGC_ASSERT_MESSAGE((MB == eltSize || (MB > eltSize && nElts == 1)), "Internal Error: mismatched layout for vector");
    }
    else
    {
        defaultKind = useA32
            ? MESSAGE_A32_UNTYPED_SURFACE_RW
            : MESSAGE_A64_SCATTERED_RW;

        MB = useA32
            ? A32_UNTYPED_MAX_BYTES
            : ((has_8DW_A64_SM && SM == SIMDMode::SIMD8)
                ? A64_SCATTERED_MAX_BYTES_8DW_SIMD8
                : A64_SCATTERED_MAX_BYTES_4DW);

        bool allowQWMessage = !useA32 && eltSize == 8 && Align >= 8U;

        defaultDataType = (eltSize == 8) ? ISA_TYPE_UQ : ISA_TYPE_UD;
        // To make sure that the send returns the correct layout for the vector.
        IGC_ASSERT_MESSAGE((eltSize == 4 /* common */ || allowQWMessage /* A64, QW */), "Internal Error: mismatched layout for vector");
    }

    MESSAGE_KIND kind = defaultKind;
    VISA_Type    dataType = defaultDataType;
    unsigned bytes = TBytes;
    size_t i = 0;
    for (; bytes >= MB; ++i, bytes -= MB)
    {
        IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
        insts[i].startByte = (uint16_t)(TBytes - bytes);
        insts[i].kind = kind;
        insts[i].blkType = dataType;
        insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
        IGC_ASSERT(insts[i].blkInBytes);
        insts[i].numBlks = MB / insts[i].blkInBytes;
    }

    // Process the remaining elements, if any. This takes at most two
    // additional sends. For example, assume the remaining bytes are for
    // <7 x i32> with A64 SIMD8 and align >= 4; then we need two sends:
    // one for the first <4 x i32> and a second for the remaining <3 x i32>.
    if (MB == A64_SCATTERED_MAX_BYTES_8DW_SIMD8)
    {   // MB == 32 bytes
        unsigned MB2 = A64_SCATTERED_MAX_BYTES_8DW_SIMD8 / 2; // 16 bytes
        if (bytes > MB2)
        {
            IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
            insts[i].startByte = (uint16_t)(TBytes - bytes);
            insts[i].kind = kind;
            insts[i].blkType = dataType;
            insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
            IGC_ASSERT(insts[i].blkInBytes);
            insts[i].numBlks = MB2 / insts[i].blkInBytes;
            ++i;
            bytes -= MB2;
        }
    }

    if (bytes > 0)
    {
        if (Align >= 4)
        {
            if (!useA32 && eltSize == 4 && bytes == 12)
            {
                kind = MESSAGE_A64_UNTYPED_SURFACE_RW;
            }
        }

        IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
        insts[i].startByte = (uint16_t)(TBytes - bytes);
        insts[i].kind = kind;
        insts[i].blkType = dataType;
        insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
        IGC_ASSERT(insts[i].blkInBytes);
        insts[i].numBlks = (uint16_t)bytes / insts[i].blkInBytes;
        ++i;
    }

    numInsts = i;
    IGC_ASSERT_MESSAGE(numInsts <= VECMESSAGEINFO_MAX_LEN, "Vector's size is too big, increase VECMESSAGEINFO_MAX_LEN to fix it!");
    IGC_ASSERT_MESSAGE(numInsts <= (sizeof(insts) / sizeof(*insts)), "Vector's size is too big, increase VECMESSAGEINFO_MAX_LEN to fix it!");
}

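// getLSCInfo is the LSC counterpart of getInfo. For transposed (block)
// accesses it decomposes the total size into the legal transposed vector
// sizes in TRANS_VEC_SIZE, largest first; for non-transposed accesses it
// splits by the per-lane maximum MB computed below.
// For illustration (a sketch): a transposed access of 24 DWs (96 bytes,
// blkInBytes == 4) is split into a 16-element message followed by an
// 8-element message (64 + 32 bytes).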
void VectorMessage::getLSCInfo(llvm::Type* Ty, uint64_t Align, CodeGenContext* ctx, bool useA32, bool transpose)
{
    IGC_ASSERT(nullptr != ctx);
    IGC_ASSERT(nullptr != Shader);

    IGCLLVM::FixedVectorType* VTy = dyn_cast<IGCLLVM::FixedVectorType>(Ty);
    Type* eTy = VTy ? VTy->getContainedType(0) : Ty;
    unsigned eltSize = Shader->GetScalarTypeSizeInRegister(eTy);
    unsigned nElts = VTy ? (unsigned)VTy->getNumElements() : 1;
    // total bytes
    const unsigned TBytes = nElts * eltSize;
    char TRANS_VEC_SIZE[8] = { 1, 2, 3, 4, 8, 16, 32, 64 };
    MESSAGE_KIND kind = useA32
        ? MESSAGE_A32_LSC_RW
        : MESSAGE_A64_LSC_RW;

    VISA_Type dataType = GetType(Ty, ctx);
    uint16_t blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);

    // Per-channel Max Bytes (MB) that can be read/written by a single send inst
    const unsigned int numLanesForSIMDSize = numLanes(Shader->m_SIMDSize);
    IGC_ASSERT(numLanesForSIMDSize);
    unsigned int MB = (8 * ctx->platform.getGRFSize()) / numLanesForSIMDSize;
    if (Align < 4 || (eltSize == 8 && Align < 8)) {
        MB = eltSize;
    }
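    // For illustration (a sketch, assuming a 64-byte GRF): at SIMD16,
    // MB = (8 * 64) / 16 = 32 bytes per lane. Under-aligned accesses (or
    // 8-byte elements with alignment < 8) fall back to one element per
    // message.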

    size_t i = 0;
    if (transpose)
    {
        unsigned bytes = TBytes;
        for (int j = 0; j < 8; j++)
        {
            const unsigned int denominator = blkInBytes * TRANS_VEC_SIZE[7 - j];
            IGC_ASSERT(denominator);

            if (bytes % denominator == 0)
            {
                IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
                insts[i].startByte = (uint16_t)(TBytes - bytes);
                insts[i].kind = kind;
                insts[i].blkType = dataType;
                insts[i].blkInBytes = blkInBytes;
                insts[i].numBlks = TRANS_VEC_SIZE[7 - j];
                bytes -= insts[i].numBlks * blkInBytes;
                i++;
                break;
            }
            else
            {
                if (bytes / denominator != 0)
                {
                    IGC_ASSERT(i < (sizeof(insts) / sizeof(*insts)));
                    insts[i].startByte = (uint16_t)(TBytes - bytes);
                    insts[i].kind = kind;
                    insts[i].blkType = dataType;
                    insts[i].blkInBytes = blkInBytes;
                    insts[i].numBlks = TRANS_VEC_SIZE[7 - j];
                    bytes -= insts[i].numBlks * blkInBytes;
                    i++;
                }  // otherwise, continue with the next (smaller) size
            }
        }
        IGC_ASSERT(bytes == 0);
    }
    else
    {
        unsigned bytes = TBytes;
        for (; bytes >= MB; ++i, bytes -= MB)
        {
            insts[i].startByte = (uint16_t)(TBytes - bytes);
            insts[i].kind = kind;
            insts[i].blkType = dataType;
            insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
            IGC_ASSERT(insts[i].blkInBytes);
            insts[i].numBlks = MB / insts[i].blkInBytes;
        }

        if (bytes > 0)
        {
            insts[i].startByte = (uint16_t)(TBytes - bytes);
            insts[i].kind = kind;
            insts[i].blkType = dataType;
            insts[i].blkInBytes = (uint16_t)CEncoder::GetCISADataTypeSize(dataType);
            IGC_ASSERT(insts[i].blkInBytes);
            insts[i].numBlks = (uint16_t)bytes / insts[i].blkInBytes;
            ++i;
        }
    }

    numInsts = i;
    IGC_ASSERT_MESSAGE(numInsts <= VECMESSAGEINFO_MAX_LEN, "Vector's size is too big, increase VECMESSAGEINFO_MAX_LEN to fix it!");
    IGC_ASSERT_MESSAGE(numInsts <= (sizeof(insts) / sizeof(*insts)), "Vector's size is too big, increase VECMESSAGEINFO_MAX_LEN to fix it!");
}

VectorMessage::VectorMessage(EmitPass* emitter) : Shader(emitter->m_currShader)
{
    numInsts = 0;
}
